import os.path

from data_read import open_file
import jsonlines

import matplotlib.pyplot as plt  # 导入pyplot模块

def do_data_clean():
    """Filter the raw review records and save the ones that pass.

    Records are read with ``open_file('./output/data.jsonl')``. A record is
    kept when its ``'title'`` is one of the five accepted rating labels and
    the length of its ``'comment'`` lies in the inclusive range 0..2500.
    The kept records are written, in input order, to
    ``./output/clean_data.jsonl`` (any previous contents are replaced), and
    a summary line with the number of kept records is printed.

    A record that lacks ``'title'``, or that has an accepted title but lacks
    ``'comment'``, makes the lookup fail and the error propagates (a
    ``KeyError`` if the records are plain dicts, which is presumably the
    case -- confirm against ``open_file``).
    """
    min_len, max_len = 0, 2500
    # A tuple (not a set) so the membership test never needs to hash the
    # title value, exactly like the list-based check it replaces.
    accepted_titles = ("很差", "较差", "还行", "推荐", "力荐")

    kept = [
        record
        for record in open_file('./output/data.jsonl')
        if record['title'] in accepted_titles
        and min_len <= len(record['comment']) <= max_len
    ]

    # Mode 'w' truncates an existing file, so no separate exists()/remove()
    # step is needed before writing.
    with jsonlines.open('./output/clean_data.jsonl', mode='w') as writer:
        writer.write_all(kept)
    print('数据清洗完成, 清洗过的数据保存在./output/clean_data.jsonl, 长度为:', len(kept))


def show_comments_lens():
    """Print and plot the distribution of comment lengths in the cleaned data.

    Reads records with ``open_file('./output/clean_data.jsonl')``, prints the
    list of ``len(record['comment'])`` values, and shows a histogram of them
    in a matplotlib window.
    """
    lengths = [len(record['comment']) for record in open_file('./output/clean_data.jsonl')]
    print(lengths)

    # Bin edges: steps of 10 up to 100, then steps of 100 up to 400.
    # NOTE(review): lengths above 400 fall outside the last edge and are
    # presumably left out of the plot -- confirm this is intended.
    bin_edges = list(range(0, 100, 10)) + [100, 200, 300, 400]
    plt.hist(lengths, bins=bin_edges)

    # Axis labels and title, then display the figure.
    plt.xlabel('Value')
    plt.ylabel('Frequency')
    plt.title('comments length')
    plt.show()

# Script entry point: run the cleaning step when executed directly.
if __name__ == '__main__':
    do_data_clean()
    # Uncomment to also plot the comment-length histogram of the cleaned data:
    # show_comments_lens()
